{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "executionInfo": {
     "elapsed": 5,
     "status": "ok",
     "timestamp": 1649954662434,
     "user": {
      "displayName": "Sam Lu",
      "userId": "15789059763790170725"
     },
     "user_tz": -480
    },
    "id": "5OzU9RtB9fWZ",
    "outputId": "146722e2-c641-46dc-a690-01e8eed9160c"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "根据本序列计算得到回报为：-2.5。\n"
     ]
    }
   ],
   "source": [
    "import numpy as np\n",
    "np.random.seed(0)\n",
    "# 定义状态转移概率矩阵P\n",
    "P = [\n",
    "    [0.9, 0.1, 0.0, 0.0, 0.0, 0.0],\n",
    "    [0.5, 0.0, 0.5, 0.0, 0.0, 0.0],\n",
    "    [0.0, 0.0, 0.0, 0.6, 0.0, 0.4],\n",
    "    [0.0, 0.0, 0.0, 0.0, 0.3, 0.7],\n",
    "    [0.0, 0.2, 0.3, 0.5, 0.0, 0.0],\n",
    "    [0.0, 0.0, 0.0, 0.0, 0.0, 1.0],\n",
    "]\n",
    "P = np.array(P)\n",
    "\n",
    "rewards = [-1, -2, -2, 10, 1, 0]  # 定义奖励函数\n",
    "gamma = 0.5  # 定义折扣因子\n",
    "\n",
    "\n",
    "# 给定一条序列,计算从某个索引（起始状态）开始到序列最后（终止状态）得到的回报\n",
    "def compute_return(start_index, chain, gamma):\n",
    "    G = 0\n",
    "    for i in reversed(range(start_index, len(chain))):\n",
    "        G = gamma * G + rewards[chain[i] - 1]\n",
    "    return G\n",
    "\n",
    "\n",
    "# 一个状态序列,s1-s2-s3-s6\n",
    "chain = [1, 2, 3, 6]\n",
    "start_index = 0\n",
    "G = compute_return(start_index, chain, gamma)\n",
    "print(\"根据本序列计算得到回报为：%s。\" % G)\n",
    "\n",
    "# 根据本序列计算得到回报为：-2.5。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "executionInfo": {
     "elapsed": 3,
     "status": "ok",
     "timestamp": 1649954662902,
     "user": {
      "displayName": "Sam Lu",
      "userId": "15789059763790170725"
     },
     "user_tz": -480
    },
    "id": "8sywqMFs9fWd",
    "outputId": "d5c626fd-70c9-44f7-a4c3-3b112e4654c4"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "MRP中每个状态价值分别为\n",
      " [[-2.01950168]\n",
      " [-2.21451846]\n",
      " [ 1.16142785]\n",
      " [10.53809283]\n",
      " [ 3.58728554]\n",
      " [ 0.        ]]\n"
     ]
    }
   ],
   "source": [
    "def compute(P, rewards, gamma, states_num):\n",
    "    ''' 利用贝尔曼方程的矩阵形式计算解析解,states_num是MRP的状态数 '''\n",
    "    rewards = np.array(rewards).reshape((-1, 1))  #将rewards写成列向量形式\n",
    "    value = np.dot(np.linalg.inv(np.eye(states_num, states_num) - gamma * P),\n",
    "                   rewards)\n",
    "    return value\n",
    "\n",
    "\n",
    "V = compute(P, rewards, gamma, 6)\n",
    "print(\"MRP中每个状态价值分别为\\n\", V)\n",
    "\n",
    "# MRP中每个状态价值分别为\n",
    "#  [[-2.01950168]\n",
    "#  [-2.21451846]\n",
    "#  [ 1.16142785]\n",
    "#  [10.53809283]\n",
    "#  [ 3.58728554]\n",
    "#  [ 0.        ]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "executionInfo": {
     "elapsed": 340,
     "status": "ok",
     "timestamp": 1649954667427,
     "user": {
      "displayName": "Sam Lu",
      "userId": "15789059763790170725"
     },
     "user_tz": -480
    },
    "id": "5ILxWaLR9fWd"
   },
   "outputs": [],
   "source": [
    "S = [\"s1\", \"s2\", \"s3\", \"s4\", \"s5\"]  # 状态集合\n",
    "A = [\"保持s1\", \"前往s1\", \"前往s2\", \"前往s3\", \"前往s4\", \"前往s5\", \"概率前往\"]  # 动作集合\n",
    "# 状态转移函数\n",
    "P = {\n",
    "    \"s1-保持s1-s1\": 1.0,\n",
    "    \"s1-前往s2-s2\": 1.0,\n",
    "    \"s2-前往s1-s1\": 1.0,\n",
    "    \"s2-前往s3-s3\": 1.0,\n",
    "    \"s3-前往s4-s4\": 1.0,\n",
    "    \"s3-前往s5-s5\": 1.0,\n",
    "    \"s4-前往s5-s5\": 1.0,\n",
    "    \"s4-概率前往-s2\": 0.2,\n",
    "    \"s4-概率前往-s3\": 0.4,\n",
    "    \"s4-概率前往-s4\": 0.4,\n",
    "}\n",
    "# 奖励函数\n",
    "R = {\n",
    "    \"s1-保持s1\": -1,\n",
    "    \"s1-前往s2\": 0,\n",
    "    \"s2-前往s1\": -1,\n",
    "    \"s2-前往s3\": -2,\n",
    "    \"s3-前往s4\": -2,\n",
    "    \"s3-前往s5\": 0,\n",
    "    \"s4-前往s5\": 10,\n",
    "    \"s4-概率前往\": 1,\n",
    "}\n",
    "gamma = 0.5  # 折扣因子\n",
    "MDP = (S, A, P, R, gamma)\n",
    "\n",
    "# 策略1,随机策略\n",
    "Pi_1 = {\n",
    "    \"s1-保持s1\": 0.5,\n",
    "    \"s1-前往s2\": 0.5,\n",
    "    \"s2-前往s1\": 0.5,\n",
    "    \"s2-前往s3\": 0.5,\n",
    "    \"s3-前往s4\": 0.5,\n",
    "    \"s3-前往s5\": 0.5,\n",
    "    \"s4-前往s5\": 0.5,\n",
    "    \"s4-概率前往\": 0.5,\n",
    "}\n",
    "# 策略2\n",
    "Pi_2 = {\n",
    "    \"s1-保持s1\": 0.6,\n",
    "    \"s1-前往s2\": 0.4,\n",
    "    \"s2-前往s1\": 0.3,\n",
    "    \"s2-前往s3\": 0.7,\n",
    "    \"s3-前往s4\": 0.5,\n",
    "    \"s3-前往s5\": 0.5,\n",
    "    \"s4-前往s5\": 0.1,\n",
    "    \"s4-概率前往\": 0.9,\n",
    "}\n",
    "\n",
    "\n",
    "# 把输入的两个字符串通过“-”连接,便于使用上述定义的P、R变量\n",
    "def join(str1, str2):\n",
    "    return str1 + '-' + str2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "executionInfo": {
     "elapsed": 3,
     "status": "ok",
     "timestamp": 1649954670178,
     "user": {
      "displayName": "Sam Lu",
      "userId": "15789059763790170725"
     },
     "user_tz": -480
    },
    "id": "juDFPGkP9fWe",
    "outputId": "20903c97-0f0e-4fb2-93e1-5b6b650fda1a"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "MDP中每个状态价值分别为\n",
      " [[-1.22555411]\n",
      " [-1.67666232]\n",
      " [ 0.51890482]\n",
      " [ 6.0756193 ]\n",
      " [ 0.        ]]\n"
     ]
    }
   ],
   "source": [
    "gamma = 0.5\n",
    "# 转化后的MRP的状态转移矩阵\n",
    "P_from_mdp_to_mrp = [\n",
    "    [0.5, 0.5, 0.0, 0.0, 0.0],\n",
    "    [0.5, 0.0, 0.5, 0.0, 0.0],\n",
    "    [0.0, 0.0, 0.0, 0.5, 0.5],\n",
    "    [0.0, 0.1, 0.2, 0.2, 0.5],\n",
    "    [0.0, 0.0, 0.0, 0.0, 1.0],\n",
    "]\n",
    "P_from_mdp_to_mrp = np.array(P_from_mdp_to_mrp)\n",
    "R_from_mdp_to_mrp = [-0.5, -1.5, -1.0, 5.5, 0]\n",
    "\n",
    "V = compute(P_from_mdp_to_mrp, R_from_mdp_to_mrp, gamma, 5)\n",
    "print(\"MDP中每个状态价值分别为\\n\", V)\n",
    "\n",
    "# MDP中每个状态价值分别为\n",
    "#  [[-1.22555411]\n",
    "#  [-1.67666232]\n",
    "#  [ 0.51890482]\n",
    "#  [ 6.0756193 ]\n",
    "#  [ 0.        ]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "executionInfo": {
     "elapsed": 317,
     "status": "ok",
     "timestamp": 1649954714601,
     "user": {
      "displayName": "Sam Lu",
      "userId": "15789059763790170725"
     },
     "user_tz": -480
    },
    "id": "3gKVFNen9scC",
    "outputId": "3d5b5f1b-d5d2-4a26-fde2-76843910507b"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "第一条序列\n",
      " [('s1', '前往s2', 0, 's2'), ('s2', '前往s3', -2, 's3'), ('s3', '前往s5', 0, 's5')]\n",
      "第二条序列\n",
      " [('s4', '概率前往', 1, 's4'), ('s4', '前往s5', 10, 's5')]\n",
      "第五条序列\n",
      " [('s2', '前往s3', -2, 's3'), ('s3', '前往s4', -2, 's4'), ('s4', '前往s5', 10, 's5')]\n"
     ]
    }
   ],
   "source": [
    "def sample(MDP, Pi, timestep_max, number):\n",
    "    ''' 采样函数,策略Pi,限制最长时间步timestep_max,总共采样序列数number '''\n",
    "    S, A, P, R, gamma = MDP\n",
    "    episodes = []\n",
    "    for _ in range(number):\n",
    "        episode = []\n",
    "        timestep = 0\n",
    "        s = S[np.random.randint(4)]  # 随机选择一个除s5以外的状态s作为起点\n",
    "        # 当前状态为终止状态或者时间步太长时,一次采样结束\n",
    "        while s != \"s5\" and timestep <= timestep_max:\n",
    "            timestep += 1\n",
    "            rand, temp = np.random.rand(), 0\n",
    "            # 在状态s下根据策略选择动作\n",
    "            for a_opt in A:\n",
    "                temp += Pi.get(join(s, a_opt), 0)\n",
    "                if temp > rand:\n",
    "                    a = a_opt\n",
    "                    r = R.get(join(s, a), 0)\n",
    "                    break\n",
    "            rand, temp = np.random.rand(), 0\n",
    "            # 根据状态转移概率得到下一个状态s_next\n",
    "            for s_opt in S:\n",
    "                temp += P.get(join(join(s, a), s_opt), 0)\n",
    "                if temp > rand:\n",
    "                    s_next = s_opt\n",
    "                    break\n",
    "            episode.append((s, a, r, s_next))  # 把（s,a,r,s_next）元组放入序列中\n",
    "            s = s_next  # s_next变成当前状态,开始接下来的循环\n",
    "        episodes.append(episode)\n",
    "    return episodes\n",
    "\n",
    "\n",
    "# 采样5次,每个序列最长不超过1000步\n",
    "episodes = sample(MDP, Pi_1, 20, 5)\n",
    "print('第一条序列\\n', episodes[0])\n",
    "print('第二条序列\\n', episodes[1])\n",
    "print('第五条序列\\n', episodes[4])\n",
    "\n",
    "# 第一条序列\n",
    "#  [('s1', '前往s2', 0, 's2'), ('s2', '前往s3', -2, 's3'), ('s3', '前往s5', 0, 's5')]\n",
    "# 第二条序列\n",
    "#  [('s4', '概率前往', 1, 's4'), ('s4', '前往s5', 10, 's5')]\n",
    "# 第五条序列\n",
    "#  [('s2', '前往s3', -2, 's3'), ('s3', '前往s4', -2, 's4'), ('s4', '前往s5', 10, 's5')]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "executionInfo": {
     "elapsed": 292,
     "status": "ok",
     "timestamp": 1649954717890,
     "user": {
      "displayName": "Sam Lu",
      "userId": "15789059763790170725"
     },
     "user_tz": -480
    },
    "id": "uZR44aSO9fWf",
    "outputId": "7354c75f-1bc7-44b2-accc-85019278720d"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "使用蒙特卡洛方法计算MDP的状态价值为\n",
      " {'s1': -1.228923788722258, 's2': -1.6955696284402704, 's3': 0.4823809701532294, 's4': 5.967514743019431, 's5': 0}\n"
     ]
    }
   ],
   "source": [
    "# 对所有采样序列计算所有状态的价值\n",
    "def MC(episodes, V, N, gamma):\n",
    "    for episode in episodes:\n",
    "        G = 0\n",
    "        for i in range(len(episode) - 1, -1, -1):  #一个序列从后往前计算\n",
    "            (s, a, r, s_next) = episode[i]\n",
    "            G = r + gamma * G\n",
    "            N[s] = N[s] + 1\n",
    "            V[s] = V[s] + (G - V[s]) / N[s]\n",
    "\n",
    "\n",
    "timestep_max = 20\n",
    "# 采样1000次,可以自行修改\n",
    "episodes = sample(MDP, Pi_1, timestep_max, 1000)\n",
    "gamma = 0.5\n",
    "V = {\"s1\": 0, \"s2\": 0, \"s3\": 0, \"s4\": 0, \"s5\": 0}\n",
    "N = {\"s1\": 0, \"s2\": 0, \"s3\": 0, \"s4\": 0, \"s5\": 0}\n",
    "MC(episodes, V, N, gamma)\n",
    "print(\"使用蒙特卡洛方法计算MDP的状态价值为\\n\", V)\n",
    "\n",
    "# 使用蒙特卡洛方法计算MDP的状态价值为\n",
    "#  {'s1': -1.228923788722258, 's2': -1.6955696284402704, 's3': 0.4823809701532294,\n",
    "# 's4': 5.967514743019431, 's5': 0}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "executionInfo": {
     "elapsed": 303,
     "status": "ok",
     "timestamp": 1649954723228,
     "user": {
      "displayName": "Sam Lu",
      "userId": "15789059763790170725"
     },
     "user_tz": -480
    },
    "id": "COkP4ZDh9fWg",
    "outputId": "943a07c9-f8db-4646-841b-d2960785eb17"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.112567796310472 0.23199480615618912\n"
     ]
    }
   ],
   "source": [
    "def occupancy(episodes, s, a, timestep_max, gamma):\n",
    "    ''' 计算状态动作对（s,a）出现的频率,以此来估算策略的占用度量 '''\n",
    "    rho = 0\n",
    "    total_times = np.zeros(timestep_max)  # 记录每个时间步t各被经历过几次\n",
    "    occur_times = np.zeros(timestep_max)  # 记录(s_t,a_t)=(s,a)的次数\n",
    "    for episode in episodes:\n",
    "        for i in range(len(episode)):\n",
    "            (s_opt, a_opt, r, s_next) = episode[i]\n",
    "            total_times[i] += 1\n",
    "            if s == s_opt and a == a_opt:\n",
    "                occur_times[i] += 1\n",
    "    for i in reversed(range(timestep_max)):\n",
    "        if total_times[i]:\n",
    "            rho += gamma**i * occur_times[i] / total_times[i]\n",
    "    return (1 - gamma) * rho\n",
    "\n",
    "\n",
    "gamma = 0.5\n",
    "timestep_max = 1000\n",
    "\n",
    "episodes_1 = sample(MDP, Pi_1, timestep_max, 1000)\n",
    "episodes_2 = sample(MDP, Pi_2, timestep_max, 1000)\n",
    "rho_1 = occupancy(episodes_1, \"s4\", \"概率前往\", timestep_max, gamma)\n",
    "rho_2 = occupancy(episodes_2, \"s4\", \"概率前往\", timestep_max, gamma)\n",
    "print(rho_1, rho_2)\n",
    "\n",
    "# 0.112567796310472 0.23199480615618912"
   ]
  }
 ],
 "metadata": {
  "colab": {
   "collapsed_sections": [],
   "name": "第3章-马尔可夫决策过程.ipynb",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
